In [1]:
# Read the USPTO patents dataset and build a small training split.
import pandas as pd
import numpy as np
import nltk
from sklearn.model_selection import train_test_split

usptodata = pd.read_csv('U.S. Patents.csv')
# .copy() makes this an independent frame, avoiding SettingWithCopyWarning
# when the tokenized column is added below.
usptodataset = usptodata[["grant_id", "claims_text", "abstract"]].copy()
usptodataset = usptodataset.dropna()
usptodataset = usptodataset.reset_index(drop=True)

# Tokenize the claims text. Mapping the single column is simpler and faster
# than a row-wise apply(..., axis=1) that touches every column.
usptodataset['tokenized_claims_text'] = usptodataset['claims_text'].map(nltk.word_tokenize)
usptodataset.head()

# Keep only 20% of the rows for training so the topic models run quickly
# (test_size=0.8 deliberately discards most data into df_test).
df_train, df_test = train_test_split(usptodataset, test_size=0.8, random_state=25)
df_train = df_train.reset_index(drop=True)
In [3]:
df_train
Out[3]:
grant_id claims_text abstract tokenized_claims_text
0 US10459019 1. An electromagnetic sensor comprising:,a fir... An electromagnetic sensor includes a first mag... [1, ., An, electromagnetic, sensor, comprising...
1 US10456083 1. A method for mapping somatosensory and moto... An apparatus for cortical mapping and method f... [1, ., A, method, for, mapping, somatosensory,...
2 US10461549 1. A method for charging a mobile terminal, th... The disclosure discloses a mobile terminal, a ... [1, ., A, method, for, charging, a, mobile, te...
3 US10462815 1. A method for a User Equipment (UE) operatin... The present invention relates to a wireless co... [1, ., A, method, for, a, User, Equipment, (, ...
4 US10458026 1. A method of producing graphene sheets compr... A method of producing graphene sheets comprisi... [1, ., A, method, of, producing, graphene, she...
... ... ... ... ...
1397 US10458777 1. A method of measuring a metrology target el... Targets, target elements and target design met... [1, ., A, method, of, measuring, a, metrology,...
1398 US10458022 1. A method for anti-corrosive treatment of me... A method for corrosion protection treatment, c... [1, ., A, method, for, anti-corrosive, treatme...
1399 US10462550 1. A storage device comprising:,a first case c... A storage device includes a first case, a seco... [1, ., A, storage, device, comprising, :, ,a, ...
1400 US10456037 1. A terminal device configured to be able to ... A terminal device is provided which is configu... [1, ., A, terminal, device, configured, to, be...
1401 US10461138 1. An organic light-emitting display device, c... An organic light-emitting display device and a... [1, ., An, organic, light-emitting, display, d...

1402 rows × 4 columns

In [46]:
# Collect the pre-tokenized claims into a plain Python list of token lists.
dictionary = df_train["tokenized_claims_text"].tolist()
#dictionary[:2]
In [3]:
# Raw claims text, one string per patent.
docs = df_train["claims_text"].tolist()
type(docs)
Out[3]:
list
In [41]:
#LDA with Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
import en_core_web_sm 
import spacy
import pyLDAvis
import pyLDAvis.gensim_models
import pyLDAvis.gensim
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

# LDA with octis
from octis.models.LDA import LDA



# Training documents (raw claims text) and the POS tags kept by lemmatization.
docs = df_train["claims_text"].tolist()
allowed_postags = ["NOUN", "ADJ", "VERB", "ADV"]

def lemmatize(docs, allowed_postags = ["NOUN", "ADJ", "VERB", "ADV"]):
    """Lemmatize documents, keeping only tokens with the allowed POS tags.

    Parameters
    ----------
    docs : iterable of str
        Raw text documents.
    allowed_postags : list of str
        spaCy coarse POS tags to keep (default: content words).

    Returns
    -------
    list of str
        One space-joined string of lemmas per input document.
    """
    # Parser and NER are disabled: only the tagger/lemmatizer are needed.
    nlp = spacy.load("en_core_web_sm", disable = ["parser", "ner"])
    lemmatized_docs = []
    # nlp.pipe processes documents in batches, which is much faster than
    # calling nlp(doc) once per document; the output is identical.
    for doc in nlp.pipe(docs):
        tokens = [token.lemma_ for token in doc if token.pos_ in allowed_postags]
        lemmatized_docs.append(" ".join(tokens))
    return lemmatized_docs



def tokenize(docs):
    """Tokenize each document with gensim's simple_preprocess.

    deacc=True strips accent marks during tokenization. Returns a list of
    token lists, one per input document.
    """
    return [gensim.utils.simple_preprocess(text, deacc=True) for text in docs]


# Pre-process input: lemmatization followed by tokenization
lemmatized_docs = lemmatize(docs)
tokenized_docs = tokenize(lemmatized_docs)

# Mapping from word IDs to words
id2word = corpora.Dictionary(tokenized_docs)

# Document-Term Matrix: one bag-of-words vector per document
corpus = [id2word.doc2bow(tokens) for tokens in tokenized_docs]

# Fit the LDA topic model
lda_model = gensim.models.ldamodel.LdaModel(
    corpus = corpus,       # Document-Term Matrix
    id2word = id2word,     # Map word IDs to words
    num_topics = 30,       # Number of latent topics to extract
    random_state = 100,    # Fixed seed for reproducibility
    passes = 100,          # Passes through the corpus during training
    )

# Interactive topic visualization with pyLDAvis
pyLDAvis.enable_notebook()
visualization = pyLDAvis.gensim_models.prepare(
    lda_model,
    corpus,
    id2word,
    mds = "mmds",   # multidimensional-scaling method for the intertopic map
    R = 30)         # number of terms shown per topic

visualization
C:\ProgramData\Anaconda3\lib\site-packages\pyLDAvis\_prepare.py:246: FutureWarning: In a future version of pandas all arguments of DataFrame.drop except for the argument 'labels' will be keyword-only
  default_term_info = default_term_info.sort_values(
Out[41]:
In [50]:
#!pip3 uninstall https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en_core_web_sm-2.2.0.tar.gz
In [56]:
from gensim.parsing.preprocessing import preprocess_string, strip_punctuation, strip_numeric

# Show the top words of the LDA topics, then strip weights/punctuation/numbers.
# Fix: the original called `topic_model.show_topics(...)`, but `topic_model`
# is never defined in this notebook — the fitted model is `lda_model`.
lda_topics = lda_model.show_topics(num_words=5)

topics = []
filters = [lambda x: x.lower(), strip_punctuation, strip_numeric]

for topic in lda_topics:
    print(topic)
    # topic[1] is the '0.085*"power" + ...' string; keep only the bare words.
    topics.append(preprocess_string(topic[1], filters))
print('----------------------------------------------------------------------------------')
[print(a) for a in topics];
(24, '0.085*"power" + 0.041*"charge" + 0.029*"claim" + 0.019*"mobile" + 0.018*"battery"')
(25, '0.050*"claim" + 0.032*"method" + 0.031*"comprise" + 0.030*"group" + 0.023*"composition"')
(6, '0.250*"say" + 0.037*"claim" + 0.025*"comprise" + 0.021*"method" + 0.017*"said"')
(22, '0.041*"location" + 0.025*"more" + 0.020*"claim" + 0.020*"base" + 0.020*"segment"')
(14, '0.132*"light" + 0.050*"device" + 0.043*"emit" + 0.021*"claim" + 0.018*"plurality"')
(20, '0.043*"gas" + 0.035*"engine" + 0.030*"claim" + 0.022*"method" + 0.021*"segment"')
(4, '0.071*"object" + 0.058*"area" + 0.045*"display" + 0.025*"datum" + 0.021*"point"')
(16, '0.068*"device" + 0.031*"datum" + 0.024*"claim" + 0.024*"network" + 0.021*"communication"')
(23, '0.038*"end" + 0.034*"member" + 0.026*"claim" + 0.024*"portion" + 0.021*"least"')
(11, '0.076*"least" + 0.062*"at" + 0.024*"ray" + 0.016*"claim" + 0.014*"device"')
----------------------------------------------------------------------------------
['power', 'charge', 'claim', 'mobile', 'battery']
['claim', 'method', 'comprise', 'group', 'composition']
['say', 'claim', 'comprise', 'method', 'said']
['location', 'more', 'claim', 'base', 'segment']
['light', 'device', 'emit', 'claim', 'plurality']
['gas', 'engine', 'claim', 'method', 'segment']
['object', 'area', 'display', 'datum', 'point']
['device', 'datum', 'claim', 'network', 'communication']
['end', 'member', 'claim', 'portion', 'least']
['least', 'at', 'ray', 'claim', 'device']
In [57]:
######################## NMF #################################
In [60]:
# ---- NMF topic model (gensim) ----
warnings.filterwarnings("ignore", category=DeprecationWarning)
allowed_postags = ["NOUN", "ADJ", "VERB", "ADV"]
docs = list(df_train.loc[:, "claims_text"].values)  # fix: was `docs = docs = ...`

# Reuse the lemmatize()/tokenize() helpers already defined in the LDA cell
# above — the original redefined byte-identical copies here, which silently
# shadowed the earlier ones and duplicated code.

# Pre-process input: lemmatization and tokenization
lemmatized_docs = lemmatize(docs)
tokenized_docs = tokenize(lemmatized_docs)

# Mapping from word IDs to words
id2word = corpora.Dictionary(tokenized_docs)

# Prepare Document-Term Matrix
corpus = [id2word.doc2bow(doc) for doc in tokenized_docs]

# Fit NMF model: See [1] for more details
nmf_model = gensim.models.Nmf(
    corpus = corpus,     # Document-Term Matrix
    id2word = id2word,   # Map word IDs to words
    num_topics = 30,     # Number of latent topics to extract
    random_state = 100,  # Fixed seed for reproducibility
    passes = 100,        # N° of passes through the corpus during training
    )

# Get the topics sorted by sparsity
#nmf_model.show_topics()
In [62]:
# Show the top words of the NMF topics, then keep only the bare words.
nmf_topics = nmf_model.show_topics(num_words=5)

filters = [lambda x: x.lower(), strip_punctuation, strip_numeric]
topics = []

for topic_id, topic_string in nmf_topics:
    print((topic_id, topic_string))
    # topic_string is the '0.040*"claim" + ...' form; extract the words only.
    topics.append(preprocess_string(topic_string, filters))
print('----------------------------------------------------------------------------------')
[print(a) for a in topics];
(18, '0.040*"claim" + 0.019*"surface" + 0.016*"comprise" + 0.015*"position" + 0.013*"member"')
(24, '0.149*"carbonyl" + 0.120*"phenyl" + 0.056*"urea" + 0.053*"difluoro" + 0.049*"substitute"')
(5, '0.105*"layer" + 0.029*"claim" + 0.022*"material" + 0.020*"electrode" + 0.019*"conductive"')
(6, '0.054*"display" + 0.036*"more" + 0.028*"object" + 0.022*"claim" + 0.021*"input"')
(19, '0.041*"channel" + 0.037*"connect" + 0.031*"gate" + 0.024*"zone" + 0.023*"wireless"')
(8, '0.095*"light" + 0.078*"image" + 0.044*"generation" + 0.044*"range" + 0.035*"execution"')
(3, '0.104*"side" + 0.096*"transistor" + 0.075*"output" + 0.064*"couple" + 0.059*"high"')
(29, '0.081*"terminal" + 0.069*"switch" + 0.048*"controllable" + 0.042*"circuit" + 0.041*"first"')
(11, '0.100*"memory" + 0.078*"say" + 0.039*"copy" + 0.031*"unit" + 0.030*"datum"')
(26, '0.073*"say" + 0.048*"module" + 0.038*"camera" + 0.033*"board" + 0.033*"circuit"')
----------------------------------------------------------------------------------
['claim', 'surface', 'comprise', 'position', 'member']
['carbonyl', 'phenyl', 'urea', 'difluoro', 'substitute']
['layer', 'claim', 'material', 'electrode', 'conductive']
['display', 'more', 'object', 'claim', 'input']
['channel', 'connect', 'gate', 'zone', 'wireless']
['light', 'image', 'generation', 'range', 'execution']
['side', 'transistor', 'output', 'couple', 'high']
['terminal', 'switch', 'controllable', 'circuit', 'first']
['memory', 'say', 'copy', 'unit', 'datum']
['say', 'module', 'camera', 'board', 'circuit']
In [63]:
##################### BERTOPIC ###################################
In [42]:
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from hdbscan import HDBSCAN

# Training corpus: patent claims text from df_train
# (fix: comment wrongly said "20newsgroups"; line read `docs = docs = ...`)
docs = list(df_train.loc[:, "claims_text"].values)

# Embedding model: a small, fast sentence-transformer
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Clustering model: See [2] for more details
cluster_model = HDBSCAN(min_cluster_size = 25,              # smallest allowed topic
                        metric = 'euclidean',
                        cluster_selection_method = 'eom',
                        prediction_data = True)             # keep data to predict new docs

# BERTopic model combining the embedder and the clusterer
Bertopic_model = BERTopic(embedding_model = embedding_model,
                       hdbscan_model = cluster_model)

# Fit the model on the corpus: returns a topic id and probability per document
topics, probs = Bertopic_model.fit_transform(docs)

# Visualization examples (fix: the commented calls referenced the undefined
# name `topic_model`; the fitted model here is `Bertopic_model`)

# Save intertopic distance map as HTML file
#Bertopic_model.visualize_topics().write_html("/intertopic_dist_map.html")

# Save topic-terms barcharts as HTML file
#Bertopic_model.visualize_barchart(top_n_topics = 25).write_html("/barchart.html")

# Save documents projection as HTML file
#Bertopic_model.visualize_documents(docs).write_html("/projections.html")

# Save topics dendrogram as HTML file
#Bertopic_model.visualize_hierarchy().write_html("/hieararchy.html")
In [43]:
# Intertopic distance map: 2-D projection of the fitted BERTopic topics.
Bertopic_model.visualize_topics()
In [73]:
# Topic-terms barcharts for the top 25 topics (disabled; uncomment to render).
#Bertopic_model.visualize_barchart(top_n_topics = 25)
In [44]:
# Document projection: each claim plotted in 2-D, colored by assigned topic.
Bertopic_model.visualize_documents(docs)
In [45]:
# Topics dendrogram: hierarchical clustering of the discovered topics.
Bertopic_model.visualize_hierarchy()
In [80]:
######################### Top2Vec###################################
In [82]:
from top2vec import Top2Vec
from sklearn.datasets import fetch_20newsgroups  # NOTE(review): unused leftover from a template — safe to remove

# Training corpus: patent claims text (fix: was `docs = docs = ...`)
docs = list(df_train.loc[:, "claims_text"].values)

# Create jointly embedded topic, document and word vectors
Top2Vec_model = Top2Vec(
  docs,
  embedding_model = 'doc2vec', # Embedding model: See [1,2] for supported models
  min_count = 50,              # Ignore words less frequent than this value
  umap_args = None,            # Dict of custom args for UMAP (None = defaults)
  hdbscan_args = None          # Dict of custom args for HDBSCAN (None = defaults)
  )

# Visualization examples: See [1,2] for more details

# Search the closest 5 topics to the input query "faith"
# topic_words, word_scores, topic_scores, topic_nums = Top2Vec_model.search_topics(
#     keywords = ["faith"],
#     num_topics = 5)

# Plot the resulting topics as wordclouds
# (fix: the example referenced the undefined name `topic_model`)
# for topic in topic_nums:
#     Top2Vec_model.generate_topic_wordcloud(topic)
2022-12-26 09:47:17,188 - top2vec - INFO - Pre-processing documents for training
2022-12-26 09:47:20,088 - top2vec - INFO - Creating joint document/word embedding
2022-12-26 09:48:38,684 - top2vec - INFO - Creating lower dimension embedding of documents
2022-12-26 09:48:42,781 - top2vec - INFO - Finding dense areas of documents
2022-12-26 09:48:42,830 - top2vec - INFO - Finding topics
In [83]:
# Retrieve every discovered topic: per-topic word lists, their scores,
# and the topic numbers.
# NOTE(review): printing topic_words dumps the full 2-D word array into the
# notebook output; consider slicing (e.g. the first few words per topic)
# to keep the output readable.
topic_words, word_scores, topic_nums = Top2Vec_model.get_topics()
print(topic_words)
[['extending' 'extends' 'longitudinal' 'engaging' 'distal' 'axial' 'body'
  'radially' 'outward' 'spaced' 'away' 'rotate' 'opening' 'circumference'
  'axially' 'extend' 'end' 'inward' 'sides' 'outer' 'outwardly' 'wall'
  'flange' 'threaded' 'assembly' 'mounting' 'spring' 'annular' 'engage'
  'inwardly' 'housing' 'circumferential' 'engages' 'integrally' 'along'
  'shoulder' 'inner' 'bottom' 'apart' 'rear' 'positioned' 'formed'
  'rotatably' 'therebetween' 'fastener' 'disposed' 'inlet' 'aperture'
  'locking' 'outlet']
 ['service' 'readable' 'computer' 'executed' 'request' 'private'
  'security' 'transitory' 'instructions' 'application' 'transaction'
  'session' 'entity' 'client' 'protocol' 'executable' 'key' 'network'
  'message' 'program' 'blockchain' 'services' 'credentials'
  'transactions' 'server' 'code' 'cryptographic' 'user' 'text' 'software'
  'proxy' 'account' 'token' 'file' 'identity' 'implemented' 'requesting'
  'encrypted' 'policy' 'operations' 'record' 'editing' 'applications'
  'computing' 'certificate' 'processor' 'cause' 'processors' 'public'
  'executing']
 ['layer' 'insulating' 'semiconductor' 'dielectric' 'doped' 'substrate'
  'etching' 'bonding' 'nitride' 'exposed' 'conductive' 'conformal'
  'metal' 'material' 'film' 'etch' 'solder' 'silicon' 'manufacturing'
  'thickness' 'disposed' 'surface' 'formed' 'electrode' 'sidewalls'
  'insulation' 'wafer' 'covered' 'surfaces' 'fin' 'top' 'adhesive'
  'emitting' 'electrically' 'gate' 'liner' 'depositing' 'wiring' 'covers'
  'mask' 'protruding' 'middle' 'copper' 'bottom' 'thin' 'stress'
  'metallic' 'package' 'sidewall' 'structure']
 ['voltage' 'circuit' 'amplifier' 'capacitor' 'comparator' 'switch'
  'signal' 'output' 'analog' 'converter' 'resistor' 'transistor'
  'oscillator' 'clock' 'inverter' 'outputs' 'input' 'switching'
  'capacitance' 'voltages' 'differential' 'transistors' 'inductor'
  'reset' 'integration' 'power' 'ac' 'capacitors' 'control' 'switches'
  'drain' 'cascode' 'switched' 'gain' 'alternating' 'cycle' 'pulse'
  'sampling' 'controllable' 'coupled' 'stage' 'current' 'impedance'
  'fast' 'polarity' 'oscillation' 'rf' 'gate' 'nand' 'logic']
 ['composition' 'solvent' 'alkyl' 'carbon' 'atoms' 'formula' 'nh'
  'polymer' 'copolymer' 'methyl' 'hydrocarbon' 'monomer' 'alkenyl'
  'phenyl' 'hydrogen' 'aqueous' 'substituted' 'acid' 'aryl' 'urea'
  'acrylate' 'catalyst' 'xb' 'ch' 'unsubstituted' 'soluble'
  'combinations' 'calcium' 'salt' 'iron' 'compound' 'heteroaryl'
  'titanium' 'mg' 'carbonyl' 'reaction' 'consisting' 'treated' 'meth'
  'resistant' 'yl' 'wt' 'particles' 'coating' 'difluorophenyl' 'solid'
  'independently' 'branched' 'containing' 'mixture']
 ['viewpoint' 'image' 'sight' 'boundary' 'watermark' 'acquired'
  'captured' 'acquiring' 'object' 'images' 'interest' 'obtained' 'values'
  'coordinates' 'swathe' 'represented' 'differences' 'capturing' 'maps'
  'difference' 'pixel' 'executes' 'option' 'correcting' 'variation'
  'representation' 'projected' 'value' 'displaying' 'frames' 'pixels'
  'imaging' 'region' 'distances' 'printing' 'color' 'similarity'
  'processing' 'corrected' 'rough' 'extraction' 'luminance' 'scale'
  'blue' 'correlation' 'acquire' 'points' 'colors' 'gesture' 'projecting']
 ['trajectory' 'vehicle' 'speed' 'road' 'demand' 'velocity'
  'acceleration' 'wheels' 'torque' 'motor' 'brake' 'propulsion'
  'vehicles' 'lane' 'wheel' 'load' 'actuators' 'prescribed' 'actual'
  'condition' 'movement' 'steering' 'electronics' 'autonomous' 'angular'
  'safety' 'travel' 'actuator' 'estimated' 'plunger' 'drive' 'moving'
  'changed' 'gradient' 'golf' 'established' 'changing' 'estimate'
  'flight' 'acquired' 'calculated' 'duration' 'change' 'efficiency'
  'detected' 'increase' 'detects' 'control' 'calculate' 'situation']
 ['lens' 'refractive' 'focal' 'optical' 'incident' 'light' 'reflected'
  'wavelength' 'polarization' 'irradiation' 'beam' 'illumination'
  'crystal' 'emitted' 'nm' 'mirror' 'projection' 'convex' 'imaging'
  'infrared' 'reflection' 'intensity' 'curvature' 'rays' 'imager' 'shape'
  'concave' 'radius' 'index' 'particle' 'view' 'absorber' 'image'
  'contrast' 'perpendicular' 'grating' 'laser' 'plane' 'spot' 'cladding'
  'surface' 'angle' 'visible' 'sequentially' 'ray' 'satisfied'
  'objective' 'capturing' 'detector' 'opaque']
 ['administering' 'seq' 'amino' 'acid' 'nucleic' 'cancer' 'polypeptide'
  'peptide' 'pharmaceutical' 'acids' 'treating' 'protein' 'fragment' 'no'
  'car' 'antibody' 'disease' 'acceptable' 'id' 'need' 'composition' 'dna'
  'effective' 'gene' 'il' 'salt' 'expression' 'consisting' 'variant' 'mg'
  'human' 'subject' 'domain' 'cells' 'yl' 'administration' 'bb'
  'concentration' 'thereof' 'methyl' 'binding' 'dose' 'isolated' 'tissue'
  'compound' 'carbohydrate' 'sequence' 'risk' 'substituted' 'formula']
 ['decoding' 'decoded' 'decoder' 'encoder' 'decode' 'symbol' 'symbols'
  'vector' 'bits' 'transform' 'prediction' 'vectors' 'encoded' 'encoding'
  'values' 'chroma' 'candidate' 'inter' 'format' 'video' 'code' 'word'
  'register' 'block' 'coefficient' 'value' 'stream' 'frames' 'rs'
  'context' 'significant' 'instruction' 'number' 'representation'
  'adaptive' 'indicate' 'bit' 'calculate' 'coefficients' 'representing'
  'pixels' 'integer' 'numeric' 'previously' 'scheme' 'calculating'
  'mapping' 'language' 'transitory' 'using']
 ['downlink' 'ue' 'uplink' 'subframe' 'rrc' 'tdd' 'enb' 'station' 'csi'
  'signaling' 'resources' 'resource' 'ues' 'bs' 'configurations' 'pucch'
  'srs' 'periodicity' 'subcarriers' 'transmission' 'mac' 'rs' 'broadcast'
  'bandwidth' 'serving' 'configuration' 'radio' 'allocation'
  'interference' 'transmit' 'transmitting' 'transceiver' 'wireless'
  'transmitted' 'message' 'shared' 'base' 'procedure' 'priority'
  'network' 'duplex' 'information' 'physical' 'indicated' 'access'
  'channel' 'symbol' 'stations' 'equipment' 'indication']]